import os
import pandas as pd
import geopandas as gpd
import pickle
import altair as alt
import json
import shapely.wkt
from datetime import datetime
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
Future directions (did not get to):
...and more!
To run this step, execute the following in a terminal:
$ python process_geography.py filename pickle_name save_as_csv
The resulting dataframe is:
# Load the pickled Baltimore geography frame. A context manager is used so the
# file handle is closed as soon as unpickling finishes.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open('data/pickles/baltimore.pkl', 'rb') as pickle_file:
    gdf = pickle.load(pickle_file)
gdf.head()
Raw data:
# Read the raw 911 calls-for-service export and preview the first rows.
df = pd.read_csv(
    filepath_or_buffer='data/sociology/911_Police_Calls_for_Service.csv',
)
df.head()
To run this step, execute the following in a terminal:
$ python geocode_calls.py dataset pickle_name directory pickle_dir
Geocoded data:
# Read the geocoded 911 calls (this rebinds `df`, replacing the raw frame)
# and preview the first rows.
df = pd.read_csv(
    filepath_or_buffer='data/sociology/911_geocoded.csv',
)
df.head()
def normByPopulation(pops, dframe, regionCol):
    """Return z-scored per-capita row counts for each region listed in ``pops``.

    For every row of ``pops`` the number of ``dframe`` rows whose ``regionCol``
    value equals that row's ``name`` is divided by its ``population``. The
    resulting rates are then standardized: ``(rate - mean) / std``, using the
    population standard deviation (``np.std`` default).

    Args:
        pops: DataFrame with ``name`` and ``population`` columns.
        dframe: DataFrame holding one row per record to count.
        regionCol: Name of the ``dframe`` column that identifies the region.

    Returns:
        DataFrame with columns ``neighborhood`` and ``normalized_crime``, one
        row per region whose rate could be computed. Regions whose population
        is zero or not numeric are skipped. If all rates are equal the
        standard deviation is 0 and the scores come out as NaN.
    """
    # Count rows per region in a single pass rather than re-filtering the
    # whole frame for every region. A plain dict avoids any positional
    # fallback that Series lookups can apply to integer keys.
    region_counts = dframe[regionCol].value_counts().to_dict()

    per_capita = {}
    for _, row in pops.iterrows():
        name, pop = row['name'], row['population']
        try:
            # int() keeps Python-int division semantics so a zero population
            # raises instead of silently producing inf/nan.
            per_capita[name] = int(region_counts.get(name, 0)) / pop
        except (ZeroDivisionError, TypeError):
            # Best effort: regions with an unusable population are left out.
            continue

    if not per_capita:
        # Nothing to standardize; avoid numpy's mean-of-empty warning.
        return pd.DataFrame([], columns=['neighborhood', 'normalized_crime'])

    rates = list(per_capita.values())
    mu = np.mean(rates)
    std = np.std(rates)
    normalized = {name: (rate - mu) / std for name, rate in per_capita.items()}
    return pd.DataFrame(
        list(normalized.items()),
        columns=['neighborhood', 'normalized_crime'],
    )
# Load the pickled population table; the context manager closes the file
# handle once unpickling is done.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open('data/pickles/population.pkl', 'rb') as population_file:
    populations = pickle.load(population_file)

# Raw number of rows per region. The column is counted once and looked up per
# region instead of re-filtering `df` for every region; values that never
# compare equal to themselves (NaN) keep a count of 0.
regionCol = 'neighborhood'
_region_totals = df[regionCol].value_counts().to_dict()
neighCounts = {
    region: int(_region_totals.get(region, 0))
    for region in set(df[regionCol])
}

# Per-capita counts standardized to z-scores, one row per region.
normalizedCrimes = normByPopulation(populations, df, regionCol)
normalizedCrimes.head()
def load_pickle(pickle_dir, pickle_name):
    """Unpickle and return the object stored in ``pickle_dir/pickle_name``.

    Args:
        pickle_dir: Directory containing the pickle file.
        pickle_name: File name of the pickle inside ``pickle_dir``.

    Returns:
        The unpickled object.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # NOTE: pickle can execute arbitrary code while loading; only use this on
    # files from a trusted source.
    # The context manager guarantees the handle is closed even if unpickling fails.
    with open(os.path.join(pickle_dir, pickle_name), 'rb') as handle:
        return pickle.load(handle)
def merge_data(attribute_column, geography, chloropleth, pickle_dir):
    """Left-join a pickled attribute table onto a pickled geography table.

    Args:
        attribute_column: Name given to the attribute table's second column.
        geography: File name of the geography pickle inside ``pickle_dir``.
        chloropleth: File name of the attribute pickle inside ``pickle_dir``.
        pickle_dir: Directory holding both pickle files.

    Returns:
        The geography table merged with the attribute table on ``key``,
        keeping every geography row (``how='left'``).
    """
    regions = load_pickle(pickle_dir, geography)
    attributes = load_pickle(pickle_dir, chloropleth)

    # Columns are renamed by position, so the attribute table must have
    # exactly two columns: the join key first, the value second.
    attributes.columns = ['key', attribute_column]

    merged = regions.merge(attributes, how='left', on='key')
    return merged
def prepare_for_altair(attribute_column, geography, chloropleth, pickle_dir='data/pickles'):
    """Build an Altair inline data source of GeoJSON features for a map.

    Args:
        attribute_column: Name of the attribute column to carry into the
            feature properties.
        geography: File name of the geography pickle inside ``pickle_dir``.
        chloropleth: File name of the attribute pickle inside ``pickle_dir``.
        pickle_dir: Directory holding both pickle files.

    Returns:
        ``alt.Data`` whose values are the GeoJSON features of the merged
        table, restricted to the ``key``, ``category``, ``attribute_column``
        and ``geometry`` columns.
    """
    merged = merge_data(attribute_column, geography, chloropleth, pickle_dir)

    # Use the authority string: the {'init': 'epsg:4326'} dict form is the
    # deprecated PROJ init syntax and emits a FutureWarning on current pyproj.
    gdf = gpd.GeoDataFrame(merged, crs='EPSG:4326', geometry='geometry')
    gdf = gdf[['key', 'category', attribute_column, 'geometry']]

    # Round-trip through GeoJSON text so Altair receives plain dicts.
    features = json.loads(gdf.to_json())['features']
    return alt.Data(values=features)
def plot_map_altair(attribute_column, geography, chloropleth, pickle_dir='data/pickles'):
    """Return an Altair map with regions colored by ``attribute_column``.

    Regions in the current multi-selection are colored by the quantitative
    attribute and all others are drawn light gray. The tooltip shows the
    region key and the attribute value.

    Args:
        attribute_column: Name of the attribute to color by.
        geography: File name of the geography pickle inside ``pickle_dir``.
        chloropleth: File name of the attribute pickle inside ``pickle_dir``.
        pickle_dir: Directory holding both pickle files.

    Returns:
        The configured ``alt.Chart``.
    """
    data_geo = prepare_for_altair(attribute_column, geography, chloropleth, pickle_dir)

    # GeoJSON features nest the columns under "properties".
    attribute_field = 'properties.' + attribute_column + ':Q'

    # The unused mouseover "highlight" selection was removed: it was never
    # attached to the chart.
    multi = alt.selection_multi()

    return alt.Chart(data_geo).mark_geoshape(
        fill='lightgray',
        stroke='white'
    ).properties(
        projection={'type': 'mercator'},
        width=550,
        height=600,
        selection=multi
    ).encode(
        color=alt.condition(multi, attribute_field, alt.value('lightgray')),
        tooltip=['properties.key:O', attribute_field]
    )
# Draw the population map from the pickles in the default pickle directory.
plot_map_altair(
    attribute_column='population',
    geography='baltimore.pkl',
    chloropleth='population.pkl',
)